Facility name, address, latitude and longitude coordinates, and industry sector codes
Chemical identification and classification information
Quantities of chemicals released on site at the facility
Quantities of chemicals transferred off site to other locations for release/disposal and further waste management
Quantities of chemicals managed through disposal, treatment, energy recovery, and recycling
Perform basic EDA on the data to find possible features to use in clustering, and gather an overall understanding of current pollutants in the US
Attempt different clustering methods (PCA and MFA pre-clustering) to find ways to group facilities that would be beneficial to legislation and regulation
Develop an analysis/reporting tool (Shiny App) to visualize and display metrics in both a statewise and facility-based interface
library(dplyr)
library(readr)
library(DataExplorer)
library(cluster)
library(factoextra)
library(FactoMineR)
library(janitor)
library(ggplot2)
library(tidyr)
library(reshape2)
library(stringr)
library(scales)
library(plotly)
knitr::opts_chunk$set(echo = TRUE)
toxins_2023 <- read_csv("C:/Users/aidan/OneDrive/Documents/GitHub/STAT-442-Final-Project/Datasets/2023_us.csv")
## Rows: 77964 Columns: 122
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (31): 2. TRIFD, 4. FACILITY NAME, 5. STREET ADDRESS, 6. CITY, 7. COUNTY,...
## dbl (84): 1. YEAR, 3. FRS ID, 10. BIA, 12. LATITUDE, 13. LONGITUDE, 22. INDU...
## lgl (7): 24. PRIMARY SIC, 25. SIC 2, 26. SIC 3, 27. SIC 4, 28. SIC 5, 29. S...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
toxins_clean <- toxins_2023 %>%
select(-c(24:29)) %>%
mutate(`45. METAL CATEGORY` = ifelse(`45. METAL CATEGORY` == "Metal complound categories", "Metal compound categories",`45. METAL CATEGORY` ))
# Load required libraries
library(dplyr)
library(janitor)
library(ggplot2)
library(tidyr)
library(reshape2)
library(stringr)
library(scales)
# Clean all column names and address multiple patterns like 'x#_', '5_5_1_', and '6_2_m79'
toxins_named <- toxins_2023 %>%
clean_names() %>%
rename_with(~ gsub("^x\\d+_", "", .), everything()) %>% # Remove 'x#_' prefixes
rename_with(~ gsub("^\\d+(_\\d+)*_", "", .), everything()) # Remove leading numeric patterns like '5_5_1_' or '6_2_'
toxins_named <- toxins_named %>%
mutate(chemical = case_when(
str_detect(chemical, "Chromium and Chromium Compounds") ~ "Chromium and Chromium Compounds",
str_detect(chemical, "Sulfuric acid") ~ "Sulfuric acid",
str_detect(chemical, "Nitrate compounds") ~ "Nitrate compounds",
str_detect(chemical, "Barium compounds") ~ "Barium compounds (except for barium sulfate)",
TRUE ~ chemical
))
# Now proceed with your summary and plotting code
chem_release_summary_haz <- toxins_named %>%
filter(clean_air_act_chemical == "YES") %>%
group_by(chemical) %>%
summarise(total_releases = sum(total_releases, na.rm = TRUE)) %>%
arrange(desc(total_releases)) %>%
head(10)
# For hazardous toxins (p1)
p1 <- ggplot(chem_release_summary_haz, aes(x = reorder(chemical, total_releases), y = total_releases)) +
geom_segment(aes(xend = chemical, yend = 0), color = "skyblue") +
geom_point(size = 4, color = "darkblue") +
coord_flip() +
labs(title = "Top 10 Hazardous Toxins Released",
y = "Total Release",
x = "Chemical") +
theme_minimal() +
theme(
axis.text.y = element_text(angle = 0, hjust = 1),
plot.title = element_text(hjust = 0.5),
panel.grid.major.y = element_blank()
) +
scale_y_continuous(labels = scales::comma)
# Now proceed with your summary and plotting code
chem_release_summary_non <- toxins_named %>%
filter(clean_air_act_chemical == "NO") %>%
group_by(chemical) %>%
summarise(total_releases = sum(total_releases, na.rm = TRUE)) %>%
arrange(desc(total_releases)) %>%
head(10)
# For non-hazardous toxins (p2)
p2 <- ggplot(chem_release_summary_non, aes(x = reorder(chemical, total_releases), y = total_releases)) +
geom_segment(aes(xend = chemical, yend = 0), color = "lightgreen") +
geom_point(size = 4, color = "darkgreen") +
coord_flip() +
labs(title = "Top 10 Non-Hazardous Toxins Released",
y = "Total Release",
x = "Chemical") +
theme_minimal() +
theme(
axis.text.y = element_text(angle = 0, hjust = 1),
plot.title = element_text(hjust = 0.5),
panel.grid.major.y = element_blank()
) +
scale_y_continuous(labels = scales::comma)
# Summarize release amounts by classification
classification_summary <- toxins_named %>%
group_by(classification) %>%
summarise(total_release = sum(total_releases, na.rm = TRUE)) %>%
arrange(desc(total_release))
# Plot for Classification (Top 10)
p3 <- classification_summary %>%
top_n(10, total_release) %>%
ggplot(aes(x = reorder(classification, total_release), y = total_release, fill = classification)) +
geom_bar(stat = "identity") +
coord_flip() +
labs(title = "Top 10 Toxins Released by Classification",
y = "Total Release",
x = "Classification") +
theme_minimal() +
theme(legend.position = "none",
plot.title = element_text(hjust = 0.5)) +
scale_y_continuous(labels = comma)
# Pivot data to gather different release methods
release_methods <- toxins_named %>%
select(chemical, fugitive_air, stack_air, water,
underground_cl_i, underground_c_ii_v,
`1a_rcra_c_landfill`, `1b_other_landfills`,
other_disposal) %>%
pivot_longer(cols = fugitive_air:other_disposal,
names_to = "release_method",
values_to = "amount") %>%
group_by(release_method) %>%
summarise(total_amount = sum(amount, na.rm = TRUE))
# Rename release methods for better readability
release_methods <- release_methods %>%
mutate(release_method = case_when(
release_method == "fugitive_air" ~ "Fugitive Air",
release_method == "stack_air" ~ "Stack Air",
release_method == "water" ~ "Water",
release_method == "underground_cl_i" ~ "Underground Class I",
release_method == "underground_c_ii_v" ~ "Underground Class II-V",
release_method == "1a_rcra_c_landfill" ~ "RCRA C Landfill",
release_method == "1b_other_landfills" ~ "Other Landfills",
release_method == "other_disposal" ~ "Other Disposal",
TRUE ~ release_method
))
# Plot for Different Release Methods
p4 <- ggplot(release_methods, aes(x = reorder(release_method, total_amount), y = total_amount, fill = release_method)) +
geom_bar(stat = "identity") +
coord_flip() +
labs(title = "Amounts Released by Different Onsite Methods",
x = "Release Method",
y = "Total Amount") +
theme_minimal() +
theme(legend.position = "none",
plot.title = element_text(hjust = 0.5)) +
scale_y_continuous(labels = comma) +
scale_fill_brewer(palette = "Set3")
# List of offsite release methods (excluding total)
offsite_methods <- c("m10", "m41", "m62", "m81", "m82",
"m66", "m67", "m64", "m65", "m73", "m79",
"m90", "m94", "m99")
# Pivot data to gather different offsite release methods
offsite_release_methods <- toxins_named %>%
select(all_of(offsite_methods)) %>%
pivot_longer(cols = everything(),
names_to = "release_method",
values_to = "amount") %>%
group_by(release_method) %>%
summarise(total_amount = sum(amount, na.rm = TRUE))
# Rename release methods for better readability
offsite_release_methods <- offsite_release_methods %>%
mutate(release_method = case_when(
release_method == "m10" ~ "Storage Only",
release_method == "m41" ~ "Solidification/Stabilization",
release_method == "m62" ~ "Wastewater Treatment",
release_method == "m81" ~ "Landfill/Disposal Surface Impoundment",
release_method == "m82" ~ "Land Treatment",
release_method == "m66" ~ "Other Waste Treatment",
release_method == "m67" ~ "Other Waste Treatment",
release_method == "m64" ~ "Other Disposal",
release_method == "m65" ~ "Other Disposal",
release_method == "m73" ~ "Land Treatment",
release_method == "m79" ~ "Other Land Disposal",
release_method == "m90" ~ "Other Off-Site Management",
release_method == "m94" ~ "Transfer to Waste Broker",
release_method == "m99" ~ "Unknown",
TRUE ~ release_method
))
# Plot for Different Offsite Release Methods
p5 <- ggplot(offsite_release_methods, aes(x = reorder(release_method, total_amount), y = total_amount, fill = release_method)) +
geom_bar(stat = "identity") +
coord_flip() +
labs(title = "Amounts Released by Different Offsite Methods",
x = "Release Method",
y = "Total Amount") +
theme_minimal() +
theme(legend.position = "none",
plot.title = element_text(hjust = 0.5),
axis.text.y = element_text(size = 8)) + # Smaller text for y-axis labels
scale_y_continuous(labels = comma) +
scale_fill_brewer(palette = "Set3")
# Calculate proportions for treated, recycled, etc.
proportions <- toxins_named %>%
summarise(
total_treated = sum(treatment_on_site + treatment_off_site, na.rm = TRUE),
total_recycled = sum(recycling_on_site + recycling_off_sit, na.rm = TRUE),
total_energy_recovery = sum(energy_recover_on + energy_recover_of, na.rm = TRUE),
total_release = sum(total_releases, na.rm = TRUE)
) %>%
pivot_longer(everything(), names_to = "category", values_to = "amount") %>%
mutate(proportion = amount / sum(amount),
category = case_when(
category == "total_treated" ~ "Treated",
category == "total_recycled" ~ "Recycled",
category == "total_energy_recovery" ~ "Energy Recovery",
category == "total_release" ~ "Released",
TRUE ~ category
))
# Plot for Proportions
p6 <- ggplot(proportions, aes(x = reorder(category, -proportion), y = proportion, fill = category)) +
geom_bar(stat = "identity") +
geom_text(aes(label = scales::percent(proportion, accuracy = 0.1)),
position = position_stack(vjust = 0.5)) +
labs(title = "Proportion of Toxins Treated, Recycled, Recovered, and Released",
x = "Category",
y = "Proportion") +
theme_minimal() +
theme(legend.position = "none") +
scale_y_continuous(labels = scales::percent) +
scale_fill_brewer(palette = "Set2")
# Calculate on-site and off-site releases
onsite_offsite_summary <- toxins_named %>%
summarise(
on_site_release = sum(on_site_release_total, na.rm = TRUE),
off_site_release = sum(off_site_release_total, na.rm = TRUE)
) %>%
pivot_longer(everything(), names_to = "release_type", values_to = "amount") %>%
mutate(release_type = case_when(
release_type == "on_site_release" ~ "On-Site Release",
release_type == "off_site_release" ~ "Off-Site Release",
TRUE ~ release_type
))
# Calculate percentages
total_release <- sum(onsite_offsite_summary$amount)
onsite_offsite_summary <- onsite_offsite_summary %>%
mutate(percentage = amount / total_release * 100)
# Plot for On-Site vs Off-Site Releases
p7 <- ggplot(onsite_offsite_summary, aes(x = release_type, y = amount, fill = release_type)) +
geom_bar(stat = "identity", width = 0.6) +
geom_text(aes(label = paste0(comma(amount), "\n(", round(percentage, 1), "%)")),
position = position_stack(vjust = 0.5),
size = 4) +
labs(title = "Comparison of On-Site vs Off-Site Releases",
x = NULL, # Remove x-axis label as it's redundant
y = "Total Amount",
fill = "Release Type") +
theme_minimal() +
theme(legend.position = "none", # Remove legend as it's redundant
axis.text.x = element_text(size = 12, face = "bold"),
plot.title = element_text(hjust = 0.5, size = 16, face = "bold")) +
scale_y_continuous(labels = comma) +
scale_fill_brewer(palette = "Set2")
ggplotly(p1)
ggplotly(p2)
ggplotly(p3)
ggplotly(p4)
ggplotly(p5)
ggplotly(p6)
ggplotly(p7)
# Data preparation for on-site
toxins_clustered_on_site <- toxins_2023 %>%
select(c(51:53, 55, 56, 58:60, 62:65)) %>% # Select specified columns
na.omit() %>% # Remove rows with missing values
filter(.[[ncol(.)]] != 0) # Remove rows where column 65 has a value of 0
# Scaling the data
toxins_clustered_on_site_scaled <- scale(toxins_clustered_on_site)
# Data preparation for off-site
toxins_clustered_off_site <- toxins_2023 %>%
select(c(66:71, 75, 76, 79:88)) %>% # Select specified columns
na.omit() %>% # Remove rows with missing values
filter(.[[ncol(.)]] != 0) # Remove rows where column 88 has a value of 0
# Scaling the data
toxins_clustered_off_site_scaled <- scale(toxins_clustered_off_site)
sample_size <- 10000
# For on-site data
random_sample_on_site <- toxins_clustered_on_site_scaled[sample(1:nrow(toxins_clustered_on_site_scaled), sample_size), ]
# For off-site data
random_sample_off_site <- toxins_clustered_off_site_scaled[sample(1:nrow(toxins_clustered_off_site_scaled), sample_size), ]
# Step 2: Visualize the elbow method using fviz_nbclust (within-cluster sum of squares)
# On-site data
fviz_nbclust(random_sample_on_site, kmeans, method = "silhouette") +
ggtitle("Silhouette Method for On-site Data")
# Off-site data
fviz_nbclust(random_sample_off_site, kmeans, method = "silhouette") +
ggtitle("Silhouette Method for Off-site Data")
# Clustering with 2 clusters for on-site
kmeans_on_site <- kmeans(toxins_clustered_on_site_scaled, centers = 2)
kmeans_on_site$centers
## 51. 5.1 - FUGITIVE AIR 52. 5.2 - STACK AIR 53. 5.3 - WATER
## 1 1.263709e-01 -8.520503e-02 -3.475093e-02
## 2 -6.806698e-06 4.589387e-06 1.871785e-06
## 55. 5.4.1 - UNDERGROUND CL I 56. 5.4.2 - UNDERGROUND C II-V
## 1 -2.606665e-02 -8.729126e-03
## 2 1.404024e-06 4.701757e-07
## 58. 5.5.1A - RCRA C LANDFILL 59. 5.5.1B - OTHER LANDFILLS
## 1 -3.235243e-02 -4.222658e-02
## 2 1.742595e-06 2.274444e-06
## 60. 5.5.2 - LAND TREATMENT 62. 5.5.3A - RCRA SURFACE IM
## 1 -1.793947e-02 -4.484244e-03
## 2 9.662714e-07 2.415342e-07
## 63. 5.5.3B - OTHER SURFACE I 64. 5.5.4 - OTHER DISPOSAL
## 1 105.530269207 129.879566229
## 2 -0.005684163 -0.006995686
## 65. ON-SITE RELEASE TOTAL
## 1 125.908039405
## 2 -0.006781768
kmeans_on_site$size
## [1] 3 55697
kmeans_on_site$tot.withinss
## [1] 536806.3
# Display the ratio of between_SS to total_SS
between_tot_ratio_on <- kmeans_on_site$betweenss / kmeans_on_site$totss * 100
cat("Between_SS / Total_SS =", round(between_tot_ratio_on, 1), "%\n")
## Between_SS / Total_SS = 19.7 %
fviz_cluster(kmeans_on_site, data = toxins_clustered_on_site_scaled,
geom = "point", ellipse.type = "norm",
ggtheme = theme_minimal())
## Too few points to calculate an ellipse
# Clustering with 6 clusters for off-site
kmeans_off_site <- kmeans(toxins_clustered_off_site_scaled, centers = 2)
kmeans_off_site$centers
## 66. 6.1 - POTW - TRNS RLSE 67. 6.1 - POTW - TRNS TRT
## 1 8.71454390 10.68526884
## 2 -0.01672721 -0.02050994
## 68. POTW - TOTAL TRANSFERS 69. 6.2 - M10 70. 6.2 - M41 71. 6.2 - M62
## 1 11.23359393 -4.296281e-02 3.616476185 -0.02662729
## 2 -0.02156243 8.246536e-05 -0.006941679 0.00005111
## 75. 6.2 - M81 76. 6.2 - M82 79. 6.2 - M66 80. 6.2 - M67 81. 6.2 - M64
## 1 2.478007849 5.3915696 -7.387454e-03 -1.828471e-02 11.78490981
## 2 -0.004756436 -0.0103489 1.417992e-05 3.509676e-05 -0.02262066
## 82. 6.2 - M65 83. 6.2 - M73 84. 6.2 - M79 85. 6.2 - M90 86. 6.2 - M94
## 1 4.142540154 -2.391045e-02 -3.897248e-02 4.269182354 -3.737444e-02
## 2 -0.007951438 4.589514e-05 7.480609e-05 -0.008194523 7.173872e-05
## 87. 6.2 - M99 88. OFF-SITE RELEASE TOTAL
## 1 -4.438416e-02 14.73970989
## 2 8.519359e-05 -0.02829228
kmeans_off_site$size
## [1] 50 26049
kmeans_off_site$tot.withinss
## [1] 431884.5
# Display the ratio of between_SS to total_SS
between_tot_ratio_off <- kmeans_off_site$betweenss / kmeans_off_site$totss * 100
cat("Between_SS / Total_SS =", round(between_tot_ratio_off, 1), "%\n")
## Between_SS / Total_SS = 8.1 %
fviz_cluster(kmeans_off_site, data = toxins_clustered_off_site_scaled,
geom = "point", ellipse.type = "norm",
ggtheme = theme_minimal())
# Data preparation for total
toxins_clustered_total <- toxins_2023 %>%
select(c(65, 88, 94, 113, 116, 119, 107, 106, 120)) %>%
na.omit()
# Scaling the data
toxins_clustered_total_scaled <- scale(toxins_clustered_total)
# For total data
random_sample_total <- toxins_clustered_total_scaled[sample(1:nrow(toxins_clustered_total_scaled), sample_size), ]
# Step 2: Visualize the elbow method using fviz_nbclust (within-cluster sum of squares)
# On-site data
fviz_nbclust(random_sample_total, kmeans, method = "silhouette") +
ggtitle("Silhouette Method for Total Data")
# Clustering with 2 clusters for on-site
kmeans_total <- kmeans(toxins_clustered_total_scaled, centers = 2)
kmeans_total$centers
## 65. ON-SITE RELEASE TOTAL 88. OFF-SITE RELEASE TOTAL
## 1 -0.01578838 -0.01207179
## 2 10.00797559 7.65209425
## 94. OFF-SITE RECYCLED TOTAL 113. 8.2 - ENERGY RECOVER ON
## 1 -0.01913677 -0.01362149
## 2 12.13046288 8.63442099
## 116. 8.5 - RECYCLING OFF SIT 119. PRODUCTION WSTE (8.1-8.7)
## 1 -0.01919015 -0.02149019
## 2 12.16429663 13.62225482
## 107. TOTAL RELEASES 106. 6.2 - TOTAL TRANSFER 120. 8.8 - ONE-TIME RELEASE
## 1 -0.01640607 -0.01579178 -0.01081905
## 2 10.39951763 10.01012788 6.85800245
kmeans_total$size
## [1] 10776 17
kmeans_total$tot.withinss
## [1] 80623.29
# Display the ratio of between_SS to total_SS
between_tot_ratio_total <- kmeans_total$betweenss / kmeans_total$totss * 100
cat("Between_SS / Total_SS =", round(between_tot_ratio_total, 1), "%\n")
## Between_SS / Total_SS = 17 %
fviz_cluster(kmeans_total, data = toxins_clustered_total_scaled,
geom = "point", ellipse.type = "norm",
ggtheme = theme_minimal())
# Perform PCA on on-site scaled data
pca_on_site <- prcomp(toxins_clustered_on_site_scaled, center = TRUE, scale. = TRUE)
# Summary of PCA to see variance explained by each principal component
summary(pca_on_site)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6 PC7
## Standard deviation 1.6531 1.05258 1.01878 1.00567 1.0010 1.00010 0.99999
## Proportion of Variance 0.2277 0.09233 0.08649 0.08428 0.0835 0.08335 0.08333
## Cumulative Proportion 0.2277 0.32005 0.40655 0.49083 0.5743 0.65768 0.74101
## PC8 PC9 PC10 PC11 PC12
## Standard deviation 0.99879 0.98111 0.94517 0.50430 2.959e-11
## Proportion of Variance 0.08313 0.08021 0.07445 0.02119 0.000e+00
## Cumulative Proportion 0.82415 0.90436 0.97881 1.00000 1.000e+00
# Visualize the PCA
# Scree plot to show the variance explained by each principal component
fviz_eig(pca_on_site)
# Alternatively, you can plot the contributions of each variable to the first two principal components
fviz_pca_var(pca_on_site,
col.var = "contrib", # color based on contributions of variables
gradient.cols = c("blue", "green", "red"),
repel = TRUE) # repel text to avoid overlap
# Perform PCA on off-site scaled data
pca_off_site <- prcomp(toxins_clustered_off_site_scaled, center = TRUE, scale. = TRUE)
# Summary of PCA to see variance explained by each principal component
summary(pca_off_site)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6 PC7
## Standard deviation 1.5712 1.4361 1.11480 1.00952 1.00709 1.00164 1.00055
## Proportion of Variance 0.1371 0.1146 0.06904 0.05662 0.05635 0.05574 0.05562
## Cumulative Proportion 0.1371 0.2517 0.32077 0.37738 0.43373 0.48947 0.54509
## PC8 PC9 PC10 PC11 PC12 PC13 PC14
## Standard deviation 1.00017 1.00006 1.00004 0.9986 0.99842 0.99203 0.99092
## Proportion of Variance 0.05557 0.05556 0.05556 0.0554 0.05538 0.05467 0.05455
## Cumulative Proportion 0.60066 0.65622 0.71178 0.7672 0.82257 0.87724 0.93179
## PC15 PC16 PC17 PC18
## Standard deviation 0.78911 0.77785 3.108e-10 1.812e-10
## Proportion of Variance 0.03459 0.03361 0.000e+00 0.000e+00
## Cumulative Proportion 0.96639 1.00000 1.000e+00 1.000e+00
# Visualize the PCA
# Scree plot to show the variance explained by each principal component
fviz_eig(pca_off_site)
# Alternatively, you can plot the contributions of each variable to the first two principal components
fviz_pca_var(pca_off_site,
col.var = "contrib", # color based on contributions of variables
gradient.cols = c("blue", "green", "red"),
repel = TRUE) # repel text to avoid overlap
# Perform PCA on off-site scaled data
pca_total <- prcomp(toxins_clustered_total_scaled, center = TRUE, scale. = TRUE)
# Summary of PCA to see variance explained by each principal component
summary(pca_total)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6 PC7
## Standard deviation 1.5130 1.4381 1.3773 1.1816 0.9213 0.61822 0.34114
## Proportion of Variance 0.2544 0.2298 0.2108 0.1551 0.0943 0.04247 0.01293
## Cumulative Proportion 0.2544 0.4841 0.6949 0.8500 0.9443 0.98678 0.99971
## PC8 PC9
## Standard deviation 0.05079 1.9e-11
## Proportion of Variance 0.00029 0.0e+00
## Cumulative Proportion 1.00000 1.0e+00
# Visualize the PCA
# Scree plot to show the variance explained by each principal component
fviz_eig(pca_total)
# Alternatively, you can plot the contributions of each variable to the first two principal components
fviz_pca_var(pca_total,
col.var = "contrib", # color based on contributions of variables
gradient.cols = c("blue", "green", "red"),
repel = TRUE) # repel text to avoid overlap
# Step 1: Extract the first 2 PCs for on-site and off-site data
pca_scores_on_site <- pca_on_site$x[, 1:2] # First 2 PCs for On-Site
pca_scores_off_site <- pca_off_site$x[, 1:2] # First 2 PCs for Off-Site
pca_score_total <- pca_total$x[,1:2]
# Step 2: Combine PCA scores with the original data
pca_on_site <- cbind(toxins_clustered_on_site, pca_scores_on_site)
pca_off_site <- cbind(toxins_clustered_off_site, pca_scores_off_site)
pca_total <- cbind(toxins_clustered_total, pca_score_total)
# Step 3: Scale PCA scores and the original data
pca_on_site_scaled <- scale(pca_on_site[, (ncol(toxins_clustered_on_site) + 1):(ncol(toxins_clustered_on_site) + 2)]) # Scale PCA scores
pca_off_site_scaled <- scale(pca_off_site[, (ncol(toxins_clustered_off_site) + 1):(ncol(toxins_clustered_off_site) + 2)]) # Scale PCA scores
pca_total_scaled <- scale(pca_total[, (ncol(toxins_clustered_total) + 1):(ncol(toxins_clustered_total) + 2)])
# Step 4: Run k-means clustering for On-Site data (assume 6 clusters for demonstration)
k_on_site <- kmeans(pca_on_site[, ncol(pca_on_site)-1:ncol(pca_on_site)], centers = 2, nstart = 25)
# Step 5: Calculate the proportion of variance explained by the clustering
cat("Proportion of variance explained (On-Site):", k_on_site$betweenss / k_on_site$totss, "\n")
## Proportion of variance explained (On-Site): 0.8206545
# Step 6: Visualize clusters for On-Site data
fviz_cluster(list(data = pca_on_site[, ncol(pca_on_site)-1:ncol(pca_on_site)], cluster = k_on_site$cluster),
geom = "point",
ellipse.type = "convex",
show.clust.cent = TRUE,
main = "Clusters from PCA (On-Site Data)")
random_sample_on_site_pca <- pca_on_site_scaled[sample(1:nrow(pca_on_site_scaled), sample_size), ]
# For off-site data
random_sample_off_site_pca <- pca_off_site_scaled[sample(1:nrow(pca_off_site_scaled), sample_size), ]
random_sample_total_pca <- pca_total_scaled[sample(1:nrow(pca_total_scaled), sample_size), ]
# Step 2: Visualize the elbow method using fviz_nbclust (within-cluster sum of squares)
# On-site data
fviz_nbclust(random_sample_on_site_pca, kmeans, method = "silhouette") +
ggtitle("Silhouette Method for On-site Data")
# Off-site data
fviz_nbclust(random_sample_off_site_pca, kmeans, method = "silhouette") +
ggtitle("Silhouette Method for Off-site Data")
# Step 7: Run k-means clustering for Off-Site data (assume 6 clusters for demonstration)
k_off_site <- kmeans(pca_off_site[, ncol(pca_off_site)-1:ncol(pca_off_site)], centers = 2, nstart = 25)
# Step 8: Calculate the proportion of variance explained by the clustering
cat("Proportion of variance explained (Off-Site):", k_off_site$betweenss / k_off_site$totss, "\n")
## Proportion of variance explained (Off-Site): 0.2787647
# Step 9: Visualize clusters for Off-Site data
fviz_cluster(list(data = pca_off_site[, ncol(pca_off_site)-1:ncol(pca_off_site)], cluster = k_off_site$cluster),
geom = "point",
ellipse.type = "convex",
show.clust.cent = TRUE,
main = "Clusters from PCA (Off-Site Data)")
# Total data
fviz_nbclust(random_sample_total_pca, kmeans, method = "silhouette") +
ggtitle("Silhouette Method for Total Data")
# Step 7: Run k-means clustering for Off-Site data (assume 6 clusters for demonstration)
k_total <- kmeans(pca_total[, ncol(pca_total)-1:ncol(pca_total)], centers = 2, nstart = 25)
# Step 8: Calculate the proportion of variance explained by the clustering
cat("Proportion of variance explained (Total):", k_total$betweenss / k_total$totss, "\n")
## Proportion of variance explained (Total): 0.5246632
# Step 9: Visualize clusters for Off-Site data
fviz_cluster(list(data = pca_total[, ncol(pca_total)-1:ncol(pca_total)], cluster = k_total$cluster),
geom = "point",
ellipse.type = "convex",
show.clust.cent = TRUE,
main = "Clusters from PCA (Total Data)")
Totals are very strong predictors.
# Step 1: Load necessary libraries
library(FactoMineR)
library(factoextra)
# Step 2: Prepare the data
mfa_toxins <- toxins_2023
# Step 3: Select variables for MFA (replace indices with appropriate column numbers)
MFAData <- toxins_2023[, c(37, 23, 46, 8, 44, 65, 88, 94, 106, 107, 119)]
# Step 4: Convert categorical variables to factors and numerical to numeric
# Adjust column indices based on the dataset
categorical_cols <- c(1, 2, 3, 4, 5) # Example indices for categorical variables
numerical_cols <- c(6, 7, 8, 9, 10, 11) # Example indices for numerical variables
MFAData[categorical_cols] <- lapply(MFAData[categorical_cols], as.factor)
MFAData[numerical_cols] <- lapply(MFAData[numerical_cols], as.numeric)
# Step 5: Remove rows with missing values
MFAData <- na.omit(MFAData)
# Step 6: Random sampling if dataset is too large
sample_size <- 1000 # Define the desired sample size
if (nrow(MFAData) > sample_size) {
set.seed(123) # Ensure reproducibility
MFAData <- MFAData[sample(1:nrow(MFAData), sample_size), ]
}
# Step 7: Define groups and types for each variable
group_sizes <- rep(1, ncol(MFAData)) # Each variable is its own group
types <- c(rep("n", length(categorical_cols)), rep("s", length(numerical_cols))) # Types for each variable
# Step 8: Run MFA
MFA1 <- MFA(
MFAData,
group = group_sizes, # Each variable treated as a separate group
type = types, # Specify type for each variable
name.group = colnames(MFAData), # Use column names for group names
graph = FALSE # Do not automatically plot graphs
)
# Step 9: Visualizations
# Scree plot: variance explained by dimensions
fviz_screeplot(MFA1, addlabels = TRUE, ylim = c(0, 50))
# Variable contributions to the dimensions
fviz_mfa_var(MFA1,
repel = TRUE, # Avoid label overlap
col.var = "cos2", # Color by quality of representation (cos2 indicates the quality)
gradient.cols = c("blue", "green", "red"))
# Individuals (data points) representation
fviz_mfa_ind(MFA1,
repel = TRUE, # Avoid label overlap
geom = "point",
col.ind = "cos2", # Color by quality of representation (cos2 indicates the quality)
gradient.cols = c("blue", "green", "red"))
# Variable contributions to the dimensions (by group)
fviz_mfa_var(MFA1, "group",
repel = TRUE, # Avoid label overlap
col.var = "cos2", # Color by quality of representation (cos2 indicates the quality)
gradient.cols = c("blue", "green", "red"))
# Quantitative variables
fviz_mfa_var(MFA1,
choice = "quanti.var", # For quantitative variables
repel = TRUE, # Avoid label overlap
col.var = "cos2", # Color by quality of representation (cos2 indicates the quality)
gradient.cols = c("blue", "green", "red"))
In the last year, Lead and Lead Compounds were released over 3x as much as any other toxic chemical
By classification, TRI is the most common toxic chemical released in the US
The most common disposal method for On-Site release is “Other Disposal”
Two thirds of toxic chemicals reported were recycled last year, while only 10% were released into the environment
Cluster sizes were very uneven across all results
Clustering facilities based on Principle Components of their On-Site release amounts proved to be much better than using Off-Site release or Total Release
MFA did not provide much explainability among clusters
With different combinations of variables and cluster numbers, it may be possible to improve
There is potential for legislation to be based on these clusters, potentially assigning penalties or strict regulations on facilities that fall within certain clusters
Displays many different metrics to aid in analysis
Could be useful to both professionals and those outside of the environmental field for research or monitoring purposes
Important Find:
Our GitHub Repo: https://github.com/aidanfred24/STAT-442-Final-Project?tab=readme-ov-file
Link to data: https://www.epa.gov/toxics-release-inventory-tri-program/tri-basic-data-files-calendar-years-1987-present